In [1]:
#Just list everything that might be used
import pandas as pd
import matplotlib.pyplot as plt
import pandas
import numpy as np
from zipfile import ZipFile
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import warnings;
warnings.simplefilter('ignore')

import plotly.express as px
import plotly.graph_objects as go
In [56]:
#get data
df = pd.read_csv('zameen-updated.csv')
display(df.head(3))
print('jumlah kolom kosong')
display(df.isna().sum())
print("datasource: kaggle")
property_id location_id page_url property_type price location city province_name latitude longitude baths area purpose bedrooms date_added agency agent Area Type Area Size Area Category
0 237062 3325 https://www.zameen.com/Property/g_10_g_10_2_gr... Flat 10000000 G-10 Islamabad Islamabad Capital 33.679890 73.012640 2 4 Marla For Sale 2 02-04-2019 NaN NaN Marla 4.0 0-5 Marla
1 346905 3236 https://www.zameen.com/Property/e_11_2_service... Flat 6900000 E-11 Islamabad Islamabad Capital 33.700993 72.971492 3 5.6 Marla For Sale 3 05-04-2019 NaN NaN Marla 5.6 5-10 Marla
2 386513 764 https://www.zameen.com/Property/islamabad_g_15... House 16500000 G-15 Islamabad Islamabad Capital 33.631486 72.926559 6 8 Marla For Sale 5 07-17-2019 NaN NaN Marla 8.0 5-10 Marla
jumlah kolom kosong
property_id          0
location_id          0
page_url             0
property_type        0
price                0
location             0
city                 0
province_name        0
latitude             0
longitude            0
baths                0
area                 0
purpose              0
bedrooms             0
date_added           0
agency           44071
agent            44072
Area Type            0
Area Size            0
Area Category        0
dtype: int64
datasource: kaggle
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168446 entries, 0 to 168445
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   property_id    168446 non-null  int64  
 1   location_id    168446 non-null  int64  
 2   page_url       168446 non-null  object 
 3   property_type  168446 non-null  object 
 4   price          168446 non-null  int64  
 5   location       168446 non-null  object 
 6   city           168446 non-null  object 
 7   province_name  168446 non-null  object 
 8   latitude       168446 non-null  float64
 9   longitude      168446 non-null  float64
 10  baths          168446 non-null  int64  
 11  area           168446 non-null  object 
 12  purpose        168446 non-null  object 
 13  bedrooms       168446 non-null  int64  
 14  date_added     168446 non-null  object 
 15  agency         124375 non-null  object 
 16  agent          124374 non-null  object 
 17  Area Type      168446 non-null  object 
 18  Area Size      168446 non-null  float64
 19  Area Category  168446 non-null  object 
dtypes: float64(3), int64(5), object(12)
memory usage: 25.7+ MB
In [4]:
#filling NaN
df['agency'] = df['agency'].fillna('unknown')
df['agent'] = df['agent'].fillna('unknown')
df.head(3)
Out[4]:
property_id location_id page_url property_type price location city province_name latitude longitude baths area purpose bedrooms date_added agency agent Area Type Area Size Area Category
0 237062 3325 https://www.zameen.com/Property/g_10_g_10_2_gr... Flat 10000000 G-10 Islamabad Islamabad Capital 33.679890 73.012640 2 4 Marla For Sale 2 02-04-2019 unknown unknown Marla 4.0 0-5 Marla
1 346905 3236 https://www.zameen.com/Property/e_11_2_service... Flat 6900000 E-11 Islamabad Islamabad Capital 33.700993 72.971492 3 5.6 Marla For Sale 3 05-04-2019 unknown unknown Marla 5.6 5-10 Marla
2 386513 764 https://www.zameen.com/Property/islamabad_g_15... House 16500000 G-15 Islamabad Islamabad Capital 33.631486 72.926559 6 8 Marla For Sale 5 07-17-2019 unknown unknown Marla 8.0 5-10 Marla
In [5]:
#check if there's duplicated datas
df.duplicated().sum()
Out[5]:
0
In [6]:
#country code, maybe this'll useful soon? if not its alright
df["code"] = "PK"
df.head(3)
Out[6]:
property_id location_id page_url property_type price location city province_name latitude longitude ... area purpose bedrooms date_added agency agent Area Type Area Size Area Category code
0 237062 3325 https://www.zameen.com/Property/g_10_g_10_2_gr... Flat 10000000 G-10 Islamabad Islamabad Capital 33.679890 73.012640 ... 4 Marla For Sale 2 02-04-2019 unknown unknown Marla 4.0 0-5 Marla PK
1 346905 3236 https://www.zameen.com/Property/e_11_2_service... Flat 6900000 E-11 Islamabad Islamabad Capital 33.700993 72.971492 ... 5.6 Marla For Sale 3 05-04-2019 unknown unknown Marla 5.6 5-10 Marla PK
2 386513 764 https://www.zameen.com/Property/islamabad_g_15... House 16500000 G-15 Islamabad Islamabad Capital 33.631486 72.926559 ... 8 Marla For Sale 5 07-17-2019 unknown unknown Marla 8.0 5-10 Marla PK

3 rows × 21 columns

Handle Outliers¶

In [7]:
ax = sns.boxplot(data = df['price'])
In [8]:
ax = sns.boxplot(data = df[['baths', 'bedrooms']])
In [9]:
ax = sns.boxplot(data = df[['Area Size']])
In [10]:
#making function to replace outlier
def outlier (x):
    sorted(x)
    q1, q3 = x.quantile([0.25, 0.75])
    IQR = q3 - q1
    lwr_bound = q1 - (1.5*IQR)
    upr_bound = q3 + (1.5*IQR)
    return lwr_bound, upr_bound
In [11]:
low, high = outlier(df['price'])
low2, high2 = outlier(df['baths'])
low3, high3 = outlier(df['bedrooms'])
low4, high4 = outlier(df['Area Size'])

*this can be used too def cap_data(df): for col in df.columns: print("capping the ",col) if (((df[col].dtype)=='float64') | ((df[col].dtype)=='int64')): percentiles = df[col].quantile([0.01,0.99]).values df[col][df[col] <= percentiles[0]] = percentiles[0] df[col][df[col] >= percentiles[1]] = percentiles[1] else: df[col]=df[col] return df

final_df=cap_data(df)

In [12]:
#replacing outlier with upper bound and lower bound value
df['price'] = np.where(df['price']>high, high, df['price'])
df['price'] = np.where(df['price']<low, low, df['price'])

df['baths'] = np.where(df['baths']>high2, high2, df['baths'])
df['baths'] = np.where(df['baths']<low2, low2, df['baths'])

df['bedrooms'] = np.where(df['bedrooms']>high3, high3, df['bedrooms'])
df['bedrooms'] = np.where(df['bedrooms']<low3, low3, df['bedrooms'])

df['Area Size'] = np.where(df['Area Size']>high4, high4, df['Area Size'])
df['Area Size'] = np.where(df['Area Size']<low4, low4, df['Area Size'])
In [13]:
ax = sns.boxplot(data = df['price'])
In [14]:
ax = sns.boxplot(data = df[['baths', 'bedrooms']])
In [15]:
ax = sns.boxplot(data = df[['Area Size']])

Visualisasi¶

In [16]:
#price distribution
plt.figure(figsize=(10,5))
plt.hist(df['price'], bins=70, range=(1,2500000), color='green')
plt.title('Distribusi Harga Properti di Pakistan',fontsize=19, color='blue')
plt.xlabel('harga (ribu)', fontsize = 12, color='red')
plt.ylabel('Jumlah rumah',fontsize = 12, color='red')
plt.xlim(xmin=0,xmax=800000)
labels, locations = plt.xticks()
plt.xticks(labels, (labels/1000).astype(int))
plt.show()

Most houses/property have price less than 100000

In [17]:
plt.clf()
df.groupby('city')['price'].mean().sort_values(ascending=False).plot(kind='bar', color='blue', figsize=(8,5))
plt.title('Harga Properti per-Kota di Pakistan', loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Kota', fontsize = 15, color='red')
plt.ylabel('Harga (seratus ribu)', fontsize = 15, color='red')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels/100000).astype(int))
plt.xticks(rotation=0)
plt.show()

Lahore City have high average house/property price.

In [18]:
plt.clf()
df.groupby('property_type')['price'].mean().sort_values(ascending=False).plot(kind='bar', color='blue', figsize=(8,5))
plt.title('Harga Properti per-Tipe di Pakistan', loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Tipe Properti', fontsize = 15, color='red')
plt.ylabel('Harga (seratus ribu)', fontsize = 15, color='red')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels/100000).astype(int))
plt.xticks(rotation=20)
plt.show()

Farm House have the expensive price than other house/property type.

In [19]:
df.groupby(['city', 'property_type'])['property_id'].count().unstack().plot(kind='bar', figsize=(8,5))
plt.title('Jumlah Properti per-Tipe dan Kota di Pakistan',loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Kota', fontsize = 12, color='red')
plt.ylabel('Jumlah (ribu)',fontsize = 12, color='red')
plt.legend(bbox_to_anchor=(1, 1), shadow=True, title='tipe')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels/1000).astype(int))
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()
In [55]:
df.groupby(['city', 'property_type'])['Area Size'].mean().unstack().plot(kind='bar', figsize=(8,5))
plt.title('Luas Rumah per-Tipe dan Kota di Pakistan',loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Kota', fontsize = 12, color='red')
plt.ylabel('Luas',fontsize = 12, color='red')
plt.legend(bbox_to_anchor=(1, 1), shadow=True, title='Tipe')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels).astype(int))
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()
In [21]:
plt.clf()
df.groupby('property_type')['property_id'].count().sort_values(ascending=False).plot(kind='bar', color='blue', figsize=(8,5))
plt.title('Jumlah Properti Berdasarkan Tipenya di Pakistan', loc='center',pad=30, fontsize=20, color='blue')
plt.xlabel('Tipe Properti', fontsize = 15, color='red')
plt.ylabel('Jumlah', fontsize = 15, color='red')
plt.ylim(ymin=0)
labels, locations = plt.yticks()
plt.yticks(labels, (labels/100000).astype(int))
plt.xticks(rotation=20)
plt.show()

House is the most property in Pakistan

In [22]:
df.columns
Out[22]:
Index(['property_id', 'location_id', 'page_url', 'property_type', 'price',
       'location', 'city', 'province_name', 'latitude', 'longitude', 'baths',
       'area', 'purpose', 'bedrooms', 'date_added', 'agency', 'agent',
       'Area Type', 'Area Size', 'Area Category', 'code'],
      dtype='object')
In [23]:
df2 = df[['price', 'baths', 'bedrooms', 'Area Size', 'property_type']]
In [24]:
plt.figure(figsize=(20,24))
sns.set_theme('notebook', style='white')
sns.set_palette("bright")
g = sns.pairplot(df2, hue='property_type', diag_kind="hist", corner=True, palette = 'bright')
plt.show()
<Figure size 1440x1728 with 0 Axes>

Map (Peta)¶

In [25]:
df3 = df[(df['purpose']=='For Sale')]
In [26]:
df4 = df[(df['purpose']=='For Rent')]
In [53]:
import plotly.graph_objects as go
import pandas as pd
fig = px.scatter_mapbox(df3,
                        lat = df3['latitude'], 
                        lon = df3['longitude'],
                        center={'lat': 31.46500, 'lon': 73.76000},
                        zoom = 8,
                        color = df3['price'], 
                        size = df3['Area Size'],
                        color_continuous_scale=px.colors.cyclical.IceFire, 
                        width = 720,
                        height = 500,
                        title = 'Pakistan House Price Scatter Map (for sale)',
                        hover_name="city"
                        #labels = {'City':'city'}
                        #text=df["city"].astype(str),
                        #popup={'City':'city', 'Price': 'price', 'Size': 'Area Size', 'lat':'latitude', 'lon':'longitude'}
                       )
fig.update_layout(mapbox_style = 'open-street-map')
#fig.update_layout(coloraxis_colorscale='Viridis')
fig.update_layout(margin = {'r' : 0, 't': 50, 'l' : 0, 'b': 0})
fig.show()
print('The larger the area size, the larger the point size (in map)')
The larger the area size, the larger the point size (in map)
In [54]:
fig = px.scatter_mapbox(df4,
                        lat = df4['latitude'], 
                        lon = df4['longitude'],
                        center={'lat': 31.46500, 'lon': 73.76000},
                        zoom = 8,
                        color = df4['price'], 
                        size = df4['Area Size'],
                        color_continuous_scale=px.colors.cyclical.IceFire, 
                        width = 720,
                        height = 500,
                        title = 'Pakistan House Price Scatter Map (for rent)',
                        hover_name="city"
                        #labels = {'City':'city'}
                        #text=df["city"].astype(str),
                        #popup={'City':'city', 'Price': 'price', 'Size': 'Area Size', 'lat':'latitude', 'lon':'longitude'}
                       )
fig.update_layout(mapbox_style = 'open-street-map')
#fig.update_layout(coloraxis_colorscale='Viridis')
fig.update_layout(margin = {'r' : 0, 't': 50, 'l' : 0, 'b': 0})
fig.show()
print('The larger the area size, the larger the point size (in map)')
The larger the area size, the larger the point size (in map)

from this map, I detect some mistake in the data. Whether its the coordinates or the city name.

In [29]:
fig = px.scatter_mapbox(df,
                        lat = df['latitude'], 
                        lon = df['longitude'],
                        center={'lat': 31.46500, 'lon': 73.76000},
                        zoom = 8,
                        color = df['property_type'], 
                        #size = df4['price'],
                        #color_continuous_scale=px.colors.cyclical.IceFire, 
                        width = 720,
                        height = 500,
                        title = 'Pakistan Property Type Scatter Map',
                        hover_name="city"
                        #labels = {'City':'city'}
                        #text=df["city"].astype(str),
                        #popup={'City':'city', 'Price': 'price', 'Size': 'Area Size', 'lat':'latitude', 'lon':'longitude'}
                       )
fig.update_layout(mapbox_style = 'open-street-map')
fig.update_layout(coloraxis_colorscale='Viridis')
fig.update_layout(margin = {'r' : 0, 't': 50, 'l' : 0, 'b': 0})
fig.show()
print('plot complete and move up')
plot complete and move up

From maps above and visualization before, I can conclude why Lahore Have the highest property price. Its because this city has few properties for rent and many properties for sale and have lots of houses and a few flats, upper/lower portion, and room. Maybe Property price influenced by other variables that not listed in the data too.

Prediksi Harga Rumah¶

In [30]:
df5 = df.drop(['property_id','location_id', 'page_url', 'location', 'latitude','longitude','area','date_added','agency','agent','Area Category'],axis="columns")
In [31]:
df5.head()
Out[31]:
property_type price city province_name baths purpose bedrooms Area Type Area Size code
0 Flat 10000000.0 Islamabad Islamabad Capital 2.0 For Sale 2.0 Marla 4.0 PK
1 Flat 6900000.0 Islamabad Islamabad Capital 3.0 For Sale 3.0 Marla 5.6 PK
2 House 16500000.0 Islamabad Islamabad Capital 6.0 For Sale 5.0 Marla 8.0 PK
3 House 43500000.0 Islamabad Islamabad Capital 4.0 For Sale 4.0 Kanal 2.0 PK
4 House 7000000.0 Islamabad Islamabad Capital 3.0 For Sale 3.0 Marla 8.0 PK

Linear Regression¶

In [32]:
# split data 1
x = df5[['property_type', 'city', 'purpose', 'Area Size', 'baths', 'bedrooms']]
y = df5["price"]

# Apply logarithmic transformation to the target variable
y_log = np.log1p(y)

# split data 2
x_train, x_test, y_train, y_test = train_test_split(x, y_log, test_size=0.2, random_state=0)

# Scaling
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))

# Preprocessing the input features
transformer = ColumnTransformer(transformers=[
    ('tnf1', OrdinalEncoder(categories=[['For Rent', 'For Sale']]), ['purpose']),
    ('tnf2', OneHotEncoder(sparse=False, drop='first'), ['property_type', 'city']),
    ('tnf3', 'passthrough', ['baths', 'bedrooms'])
], remainder='passthrough')

x_train_transformed = transformer.fit_transform(x_train)
x_test_transformed = transformer.transform(x_test)

# algoritma linear regression
lr = LinearRegression()
lr.fit(x_train_transformed, y_train_scaled)

# prediksi
y_pred_scaled = lr.predict(x_test_transformed)

# kembaliin y_pred!
y_pred = np.expm1(scaler.inverse_transform(y_pred_scaled))

print(y_pred)
[[10513225.22113776]
 [ 5735409.93628062]
 [ 7541761.94064282]
 ...
 [19276297.22845063]
 [  138985.19122532]
 [ 7399879.90387523]]
In [33]:
#validation
mean_absolute_error(y_test_scaled, y_pred_scaled)
Out[33]:
0.20625226567860655
In [52]:
# Buat DataFrame baru dengan ciri yang ingin diprediksi sendiri
new_data = pd.DataFrame({
    'property_type': ['Flat'],
    'city': ['Islamabad'],
    'purpose': ['For Sale'],
    'Area Size': [2],
    'baths': [3],
    'bedrooms': [3]
})

# Transformasikan data baru menggunakan transformer yang telah ada
new_data_transformed = transformer.transform(new_data)

#prediksi dengan model
y_pred_scaled = lr.predict(new_data_transformed)

# Kembalikan nilai prediksi ke skala semula
y_pred = np.expm1(scaler.inverse_transform(y_pred_scaled))

# Cetak prediksi harga rumah
print("prediksi harga rumah:", y_pred)
prediksi harga rumah: [[8494462.58337712]]